In [1]:
import pandas as pd
from pathlib import Path
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
import scipy.stats as stats
import statsmodels.stats.multicomp as mult
import re
import warnings

warnings.filterwarnings('ignore')
In [2]:
df=pd.read_csv('philosophy_data.csv')
df_new=df[['author','sentence_str']]
df1=df_new.loc[df_new['author']=='Aristotle']

histogram function

In [4]:
import pandas as pd
import nltk.corpus
#nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')+['.','it','a']
df1['sentence_str_lower'] = df1['sentence_str'].str.lower()
df1['clean'] = df1['sentence_str_lower'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
word_list=df1['clean'].values.tolist()

# Assign the Counter instance `most_common` call to a variable:
word_frequency = Counter(" ".join(word_list).split()).most_common(10)

# `most_common` returns a list of (word, count) tuples
words = [word for word, _ in word_frequency]
counts = [counts for _, counts in word_frequency]

plt.bar(words, counts)
plt.title("10 most frequent words in Aristotle")
plt.ylabel("Frequency")
plt.xlabel("Words")
plt.show()
In [5]:
df2=df_new.loc[df_new['author']=='Ricardo']

df2['sentence_str_lower'] = df2['sentence_str'].str.lower()

stop_words = stopwords.words('english')+['.','it','a','the']
df2['clean'] = df2['sentence_str_lower'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

word_list2=df2['clean'].values.tolist()

# Assign the Counter instance `most_common` call to a variable:
word_frequency = Counter(" ".join(word_list2).split()).most_common(10)

# `most_common` returns a list of (word, count) tuples
words = [word for word, _ in word_frequency]
counts = [counts for _, counts in word_frequency]

plt.bar(words, counts)
plt.title("10 most frequent words for Ricardo")
plt.ylabel("Frequency")
plt.xlabel("Words")
plt.show()
In [6]:
df3=df_new.loc[df_new['author']=='Plato']
df3['sentence_str_lower'] = df3['sentence_str'].str.lower()
stop_words = stopwords.words('english')+['.','it','a','the']
df3['lower'] = df3['sentence_str_lower'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df3['clean'] = df3['lower'].apply(lambda x: ' '.join([word for word in x.split() if word.isalnum()]))
word_list3=df3['clean'].values.tolist()
# Assign the Counter instance `most_common` call to a variable:
word_frequency = Counter(" ".join(word_list3).split()).most_common(10)

# `most_common` returns a list of (word, count) tuples
words = [word for word, _ in word_frequency]
counts = [counts for _, counts in word_frequency]

plt.bar(words, counts)
plt.title("10 most frequent words for Plato")
plt.ylabel("Frequency")
plt.xlabel("Words")
plt.show()

Word cloud function

In [6]:
from wordcloud import WordCloud
# for preprocessing
import nltk
#from nltk.probability import FreqDist
from nltk.corpus import stopwords
#from nltk import word_tokenize
#from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

#from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.feature_extraction.text import TfidfTransformer
#from sklearn.cluster import MiniBatchKMeans, KMeans
#from sklearn.decomposition import PCA
#from sklearn.manifold import TSNE

import pprint
pp = pprint.PrettyPrinter()
import collections
from collections import Counter

# Data Cleaning 
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('maxent_ne_chunker')
#nltk.download('words')
#nltk.download('wordnet')
nltk_stopwords = stopwords.words('english')
comment_words = ''
#stopwords = set(STOPWORDS)
text_1 = " ".join(text for text in word_list)
print ("There are {} words in the combination of all cells in column Text.".format(len(text_1)))

# Generate a word cloud image

wordcloud = WordCloud(stopwords =nltk_stopwords, background_color="white", width=600, height=300,max_words=1000).generate(text_1)

# Display the generated image:

# set the figsize
plt.figure(figsize=[15,10])
# plot the wordcloud
plt.imshow(wordcloud, interpolation="bilinear")
# remove plot axes
plt.axis("off")
# save the image
plt.savefig('Aristotle_wc.png')
There are 4576959 words in the combination of all cells in column Text.
In [7]:
#stopwords = set(STOPWORDS)
text_2 = " ".join(text for text in word_list2)
print ("There are {} words in the combination of all cells in column Text.".format(len(text_2)))

# Generate a word cloud image

wordcloud = WordCloud(stopwords =nltk_stopwords, background_color="white", width=600, height=300,max_words=1000).generate(text_2)

# Display the generated image:

# set the figsize
plt.figure(figsize=[15,10])
# plot the wordcloud
plt.imshow(wordcloud, interpolation="bilinear")
# remove plot axes
plt.axis("off")
# save the image
plt.savefig('Ricardo_wc.png')
There are 375275 words in the combination of all cells in column Text.
In [8]:
text_3 = " ".join(text for text in word_list3)
print ("There are {} words in the combination of all cells in column Text.".format(len(text_3)))

# Generate a word cloud image

wordcloud = WordCloud(stopwords =nltk_stopwords, background_color="white", width=600, height=300,max_words=1000).generate(text_3)

# Display the generated image:

# set the figsize
plt.figure(figsize=[15,10])
# plot the wordcloud
plt.imshow(wordcloud, interpolation="bilinear")
# remove plot axes
plt.axis("off")
# save the image
plt.savefig('Plato_wc.png')
There are 1899821 words in the combination of all cells in column Text.

Word cloud and histogram for different schools of philosophy

In [9]:
df=pd.read_csv('/Users/xuefei/Desktop/philosophy_data.csv')
df.head(10)
df4=df[['school','sentence_str']]
df4.head(10)
df5=df4.loc[df4['school']=='aristotle']
df5['sentence_str_lower'] = df5['sentence_str'].str.lower()
/var/folders/7f/k8flrtsj7hj5bcvsk6xyz27c0000gn/T/ipykernel_3750/2682837144.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['sentence_str_lower'] = df5['sentence_str'].str.lower()
In [10]:
df5['sentence_str_lower'] = df5['sentence_str'].str.lower()
stop_words = stopwords.words('english')+['.','it','a','the']
df5['lower'] = df5['sentence_str_lower'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df5['clean'] = df5['lower'].apply(lambda x: ' '.join([word for word in x.split() if word.isalnum()]))
word_list5=df5['clean'].values.tolist()
# Assign the Counter instance `most_common` call to a variable:
word_frequency = Counter(" ".join(word_list5).split()).most_common(10)

# `most_common` returns a list of (word, count) tuples
words = [word for word, _ in word_frequency]
counts = [counts for _, counts in word_frequency]

plt.bar(words, counts)
plt.title("10 most frequent words for Plato")
plt.ylabel("Frequency")
plt.xlabel("Words")
plt.show()
/var/folders/7f/k8flrtsj7hj5bcvsk6xyz27c0000gn/T/ipykernel_3750/2734599636.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['sentence_str_lower'] = df5['sentence_str'].str.lower()
/var/folders/7f/k8flrtsj7hj5bcvsk6xyz27c0000gn/T/ipykernel_3750/2734599636.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['lower'] = df5['sentence_str_lower'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
/var/folders/7f/k8flrtsj7hj5bcvsk6xyz27c0000gn/T/ipykernel_3750/2734599636.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['clean'] = df5['lower'].apply(lambda x: ' '.join([word for word in x.split() if word.isalnum()]))
In [11]:
text_5 = " ".join(text for text in word_list5)
print ("There are {} words in the combination of all cells in column Text.".format(len(text_5)))

# Generate a word cloud image

wordcloud = WordCloud(stopwords =nltk_stopwords, background_color="white", width=600, height=300,max_words=1000).generate(text_5)

# Display the generated image:

# set the figsize
plt.figure(figsize=[15,10])
# plot the wordcloud
plt.imshow(wordcloud, interpolation="bilinear")
# remove plot axes
plt.axis("off")
# save the image
plt.savefig('Aristotle_wc.png')
There are 3266912 words in the combination of all cells in column Text.
In [12]:
df6 =df4.loc[df4['school']=='plato']
df6['sentence_str_lower'] = df6['sentence_str'].str.lower()
stop_words = stopwords.words('english')+['.','it','a','the']
df6['lower'] = df6['sentence_str_lower'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df6['clean'] = df6['lower'].apply(lambda x: ' '.join([word for word in x.split() if word.isalnum()]))
word_list6=df6['clean'].values.tolist()
# Assign the Counter instance `most_common` call to a variable:
word_frequency = Counter(" ".join(word_list6).split()).most_common(10)

# `most_common` returns a list of (word, count) tuples
words = [word for word, _ in word_frequency]
counts = [counts for _, counts in word_frequency]

plt.bar(words, counts)
plt.title("10 most frequent words for Plato")
plt.ylabel("Frequency")
plt.xlabel("Words")
plt.show()
/var/folders/7f/k8flrtsj7hj5bcvsk6xyz27c0000gn/T/ipykernel_3750/2107918592.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6['sentence_str_lower'] = df6['sentence_str'].str.lower()
/var/folders/7f/k8flrtsj7hj5bcvsk6xyz27c0000gn/T/ipykernel_3750/2107918592.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6['lower'] = df6['sentence_str_lower'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
/var/folders/7f/k8flrtsj7hj5bcvsk6xyz27c0000gn/T/ipykernel_3750/2107918592.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6['clean'] = df6['lower'].apply(lambda x: ' '.join([word for word in x.split() if word.isalnum()]))
In [13]:
text_6 = " ".join(text for text in word_list6)
print ("There are {} words in the combination of all cells in column Text.".format(len(text_6)))

# Generate a word cloud image

wordcloud = WordCloud(stopwords =nltk_stopwords, background_color="white", width=600, height=300,max_words=1000).generate(text_6)

# Display the generated image:

# set the figsize
plt.figure(figsize=[15,10])
# plot the wordcloud
plt.imshow(wordcloud, interpolation="bilinear")
# remove plot axes
plt.axis("off")
# save the image
plt.savefig('Plato_wc.png')
There are 1899821 words in the combination of all cells in column Text.
In [14]:
df7 =df4.loc[df4['school']=='capitalism']
df7['sentence_str_lower'] = df7['sentence_str'].str.lower()
stop_words = stopwords.words('english')+['.','it','a','the']
df7['lower'] = df7['sentence_str_lower'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df7['clean'] = df7['lower'].apply(lambda x: ' '.join([word for word in x.split() if word.isalnum()]))
word_list7=df7['clean'].values.tolist()
# Assign the Counter instance `most_common` call to a variable:
word_frequency = Counter(" ".join(word_list7).split()).most_common(10)

# `most_common` returns a list of (word, count) tuples
words = [word for word, _ in word_frequency]
counts = [counts for _, counts in word_frequency]

plt.bar(words, counts)
plt.title("10 most frequent words for Capitalism")
plt.ylabel("Frequency")
plt.xlabel("Words")
plt.show()
/var/folders/7f/k8flrtsj7hj5bcvsk6xyz27c0000gn/T/ipykernel_3750/105575850.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df7['sentence_str_lower'] = df7['sentence_str'].str.lower()
/var/folders/7f/k8flrtsj7hj5bcvsk6xyz27c0000gn/T/ipykernel_3750/105575850.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df7['lower'] = df7['sentence_str_lower'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
/var/folders/7f/k8flrtsj7hj5bcvsk6xyz27c0000gn/T/ipykernel_3750/105575850.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df7['clean'] = df7['lower'].apply(lambda x: ' '.join([word for word in x.split() if word.isalnum()]))
In [15]:
text_7 = " ".join(text for text in word_list7)
print ("There are {} words in the combination of all cells in column Text.".format(len(text_7)))

# Generate a word cloud image

wordcloud = WordCloud(stopwords =nltk_stopwords, background_color="white", width=600, height=300,max_words=1000).generate(text_7)

# Display the generated image:

# set the figsize
plt.figure(figsize=[15,10])
# plot the wordcloud
plt.imshow(wordcloud, interpolation="bilinear")
# remove plot axes
plt.axis("off")
# save the image
plt.savefig('Plato_wc.png')
There are 1648398 words in the combination of all cells in column Text.
In [ ]: